I, [Yanpu Huang], confirm that the work presented in this assessment is my own. Where information has been derived from other sources, I confirm that this has been indicated in the work.
date: 20 December, 2021
library(tidyverse)
library(tmap)
library(rgdal)
library(broom)
library(mapview)
library(crosstalk)
library(sf)
library(sp)
library(spdep)
library(car)
library(fs)
library(janitor)
library(here)
library(landsat)
library(broom)
library(tidypredict)
library(spatstat)
library(here)
library(sp)
library(rgeos)
library(maptools)
library(GISTools)
library(geojson)
library(geojsonio)
library(tmaptools)
library(spatialreg)
library(spgwr)
year_borough_grocery.csv: https://figshare.com/articles/dataset/Area-level_grocery_purchases/7796666?file=18848321 london_obesity_borough_2012.csv: https://figshare.com/articles/dataset/Validation_data_obesity_diabetes_/7796672?file=14510789
London_Borough_Excluding_MHW.shp: https://data.london.gov.uk/dataset/statistical-gis-boundary-files-london
Read the datasets from the “data” folder.
# Read the London borough boundary shapefile, reproject it to British
# National Grid (EPSG:27700) and standardise column names to snake_case.
LondonBoroughs <- st_read(here::here("data", "statistical-gis-boundaries-london", "ESRI", "London_Borough_Excluding_MHW.shp")) %>%
st_transform(., 27700) %>%
clean_names()
## Reading layer `London_Borough_Excluding_MHW' from data source
## `/Users/calvinhuangk/Desktop/CASA/modules/GIS/casa0005-final-exam-HuangK1227/data/statistical-gis-boundaries-london/ESRI/London_Borough_Excluding_MHW.shp'
## using driver `ESRI Shapefile'
## Simple feature collection with 33 features and 7 fields
## Geometry type: MULTIPOLYGON
## Dimension: XY
## Bounding box: xmin: 503568.2 ymin: 155850.8 xmax: 561957.5 ymax: 200933.9
## Projected CRS: OSGB 1936 / British National Grid
# Read the borough-level obesity validation data (33 London boroughs).
# show_col_types = FALSE suppresses read_csv's column-specification message,
# as the message itself recommends.
obesity <- read_csv(here::here("data", "london_obesity_borough_2012.csv"),
                    show_col_types = FALSE) %>%
  clean_names()
## Rows: 33 Columns: 5
## chr (1): oslaua
## dbl (4): f_healthy_weight, f_overweight, f_obese, weighted_sample
# Read the Tesco grocery-purchase data aggregated to borough level.
# show_col_types = FALSE suppresses read_csv's column-specification message,
# as the message itself recommends.
grocery <- read_csv(here::here("data", "year_borough_grocery.csv"),
                    show_col_types = FALSE) %>%
  clean_names()
## Rows: 33 Columns: 202
## chr (1): area_id
## dbl (201): weight, weight_perc2.5, weight_perc25, weight_perc50, weight_perc...
# MSOA-level data was considered but not used in this analysis:
# msoa <- read_csv(here::here("data", "msoa-data.csv")) %>%
#   clean_names()
# Join the obesity data onto the boroughs by GSS code; distinct() guards
# against any duplicate rows introduced by the merge.
lb_profile1 <- LondonBoroughs%>%
merge(.,
obesity,
by.x="gss_code",
by.y="oslaua",
no.dups = TRUE)%>%
distinct(.,gss_code,
.keep_all = TRUE)
# Join the grocery data onto the borough/obesity table and derive the total
# quantity of each nutrient per borough: per-transaction average multiplied
# by the number of transactions. across() builds all eight total_* columns
# in one step, in the same order as before.
lb_profile2 <- lb_profile1 %>%
  merge(.,
        grocery,
        by.x = "gss_code",
        by.y = "area_id",
        no.dups = TRUE) %>%
  distinct(., gss_code, .keep_all = TRUE) %>%
  mutate(across(c(fat, carb, sugar, salt, protein, saturate, fibre, alcohol),
                ~ .x * num_transactions,
                .names = "total_{.col}"))
# Keep only the identifier, borough name, obesity rate and the nutrient
# variables (totals plus per-transaction averages). The sf geometry column
# is "sticky" and is carried along automatically.
lb_profile3 <- lb_profile2 %>%
  dplyr::select(gss_code, name, f_obese,
                total_fat, total_carb, total_sugar, total_salt,
                total_protein, total_saturate, total_fibre, total_alcohol,
                fat, sugar, saturate, carb, fibre, protein, salt, alcohol)
Make the basic obesity rate map in London
# Static choropleth of the raw adult obesity rate by borough,
# classified with Jenks natural breaks.
tmap_mode("plot")
## tmap mode set to plotting
tm_shape(lb_profile3) +
tm_polygons("f_obese",
style="jenks",
palette="PuOr",
midpoint=NA,
title="obesity_rate")
We can see that the south-west of London has a high obesity rate.
Make the coordinates data prepared for Moran’s I test and other tests, to see the spatial auto-correlation of obesity rate
# Borough centroids, used as point locations when plotting neighbour links.
coords_lb <- lb_profile3%>%
st_centroid()%>%
st_geometry()
plot(coords_lb,axes=TRUE)
# Queen-contiguity neighbour list: boroughs sharing any boundary point
# count as neighbours.
lb_nb <- lb_profile3 %>%
poly2nb(., queen=T)
summary(lb_nb)
## Neighbour list object:
## Number of regions: 33
## Number of nonzero links: 136
## Percentage nonzero weights: 12.48852
## Average number of links: 4.121212
## Link number distribution:
##
## 2 3 4 5 6 7
## 2 10 9 7 4 1
## 2 least connected regions:
## 4 16 with 2 links
## 1 most connected region:
## 5 with 7 links
#plot the neighbour links between borough centroids
plot(lb_nb, st_geometry(coords_lb), col="red")
#add the borough polygons underneath
plot(lb_profile3$geometry, add=T)
# Binary ("B") weights matrix: entries are 1 for neighbours, 0 otherwise,
# so the grand sum equals the total number of links and a column sum is
# one borough's neighbour count.
lb.lw <- lb_nb %>%
nb2mat(., style="B")
sum(lb.lw)
## [1] 136
sum(lb.lw[,1])
## [1] 5
# NOTE(review): lb.lw is re-assigned here as a different object — a globally
# standardised ("C") weights list, which is what the tests below actually use.
lb.lw <- lb_nb %>%
nb2listw(., style="C")
lb.lw
## Characteristics of weights list object:
## Neighbour list object:
## Number of regions: 33
## Number of nonzero links: 136
## Percentage nonzero weights: 12.48852
## Average number of links: 4.121212
##
## Weights style: C
## Weights constants summary:
## n nn S0 S1 S2
## C 33 1089 33 16.01471 143.6613
Do the several spatial model tests
# Global Moran's I for borough obesity rates: values near +1 indicate
# clustering, near 0 randomness, near -1 dispersion. The `.` placeholder
# keeps the data label of the htest object unchanged.
i_lb_global_obesity <- lb_profile3$f_obese %>%
  as.vector() %>%
  moran.test(., lb.lw)
i_lb_global_obesity
##
## Moran I test under randomisation
##
## data: .
## weights: lb.lw
##
## Moran I statistic standard deviate = 0.73232, p-value = 0.232
## alternative hypothesis: greater
## sample estimates:
## Moran I statistic Expectation Variance
## 0.05038149 -0.03125000 0.01242566
# Global Geary's C: values below 1 indicate positive spatial
# autocorrelation; more sensitive to local differences than Moran's I.
c_lb_global_obesity <- lb_profile3 %>%
pull(f_obese) %>%
as.vector()%>%
geary.test(., lb.lw)
c_lb_global_obesity
##
## Geary C test under randomisation
##
## data: .
## weights: lb.lw
##
## Geary C statistic standard deviate = 1.1834, p-value = 0.1183
## alternative hypothesis: Expectation greater than statistic
## sample estimates:
## Geary C statistic Expectation Variance
## 0.8433236 1.0000000 0.0175270
# Getis-Ord global G: distinguishes clustering of high values from
# clustering of low values.
g_lb_global_obesity <- lb_profile3 %>%
pull(f_obese) %>%
as.vector()%>%
globalG.test(., lb.lw)
g_lb_global_obesity
##
## Getis-Ord global G statistic
##
## data: .
## weights: lb.lw
##
## standard deviate = -1.0102, p-value = 0.8438
## alternative hypothesis: greater
## sample estimates:
## Global G statistic Expectation Variance
## 3.040045e-02 3.125000e-02 7.072561e-07
So we could say the spatial auto-correlation of obesity rate in London is very slightly positive.
Then focus on local moran
# Local Moran's I for each borough: Ii, its expectation and variance,
# the z-score and the p-value, returned as a tibble.
I_lb_Local_obesity <- lb_profile3 %>%
pull(f_obese) %>%
as.vector()%>%
localmoran(., lb.lw)%>%
as_tibble()
slice_head(I_lb_Local_obesity, n=5)
## # A tibble: 5 × 5
## Ii E.Ii Var.Ii Z.Ii `Pr(z != E(Ii))`
## <localmrn> <localmrn> <localmrn> <localmrn> <localmrn>
## 1 -2.030943880 -0.2120364582 1.220328862 -1.64653950 0.09965273
## 2 0.406556599 -0.1316443244 0.807781023 0.59882182 0.54929171
## 3 0.001638183 -0.0007847383 0.005469339 0.03276211 0.97386429
## 4 0.168347659 -0.0064720866 0.049483675 0.78588592 0.43193432
## 5 0.067983680 -0.0001904077 0.001229428 1.94432019 0.05185685
# Attach the local statistics to the spatial data for mapping.
# NOTE(review): count_I/density_I (and count_Iz/density_Iz) store the same
# values under two names — presumably for parallel map labels; confirm.
lb_profile3 <- lb_profile3 %>%
mutate(count_I = as.numeric(I_lb_Local_obesity$Ii))%>%
mutate(count_Iz =as.numeric(I_lb_Local_obesity$Z.Ii))%>%
mutate(density_I =as.numeric(I_lb_Local_obesity$Ii))%>%
mutate(density_Iz =as.numeric(I_lb_Local_obesity$Z.Ii))
Plot the local moran z score map
# Diverging breaks at the conventional z-score critical values
# (roughly 90%, 95% and 99% two-tailed significance).
breaks1 <- c(-1000, -2.58, -1.96, -1.65, 1.65, 1.96, 2.58, 1000)
# brewer.pal() lives in RColorBrewer, which is never loaded with library()
# above (it is only attached indirectly through another package's Depends).
# Namespace it explicitly so this chunk survives changes to the library list.
MoranColours <- rev(RColorBrewer::brewer.pal(8, "RdGy"))
tmap_mode("plot")
## tmap mode set to plotting
tm_shape(lb_profile3) +
tm_polygons("count_Iz",
style="fixed",
breaks=breaks1,
palette=MoranColours,
midpoint=NA,
title="Local Moran's I z score, adult obesity rate in London boroughs")
Plot the local Gi* map
# Local Getis-Ord Gi*: z-scores that identify hot and cold spots of obesity.
Gi_lb_Local_Obesity <- lb_profile3 %>%
pull(f_obese) %>%
as.vector()%>%
localG(., lb.lw)
head(Gi_lb_Local_Obesity)
## [1] -1.64653950 0.59882182 0.03276211 0.78588592 -1.94432019 0.98516705
lb_profile3 <- lb_profile3 %>%
mutate(obesity_G = as.numeric(Gi_lb_Local_Obesity))
# NOTE(review): brewer.pal() is from RColorBrewer, which is not loaded
# explicitly in the library list — it appears to be attached through another
# package's dependencies; confirm or use RColorBrewer::brewer.pal().
GIColours<- rev(brewer.pal(8, "RdBu"))
tmap_mode("plot")
## tmap mode set to plotting
tm_shape(lb_profile3) +
tm_polygons("obesity_G",
style="fixed",
breaks=breaks1,
palette=GIColours,
midpoint=NA,
title="local Gi*, Adult obesity in London boroughs ")
Now we carry out the analysis with models and data visualisation, using scatter plots to see the linear regression’s fitted line on the raw data.
# qplot() is deprecated since ggplot2 3.4.0; build the scatter plots with
# ggplot() instead. A single helper avoids writing the same plot five times.
# labs() reproduces the axis titles qplot derived from its arguments.
scatter_vs_obesity <- function(xvar) {
  # xvar: column name (string) of the predictor to plot against f_obese.
  ggplot(lb_profile3, aes(x = .data[[xvar]], y = f_obese)) +
    geom_point() +
    labs(x = xvar, y = "f_obese")
}
# For each nutrient total: raw points, an OLS fitted line, and jittered
# points on top (as in the original plots).
q1 <- scatter_vs_obesity("total_fat")
q1 + stat_smooth(method="lm", se=FALSE, size=1) +
  geom_jitter()
q2 <- scatter_vs_obesity("total_sugar")
q2 + stat_smooth(method="lm", se=FALSE, size=1) +
  geom_jitter()
q3 <- scatter_vs_obesity("total_saturate")
q3 + stat_smooth(method="lm", se=FALSE, size=1) +
  geom_jitter()
q4 <- scatter_vs_obesity("total_carb")
q4 + stat_smooth(method="lm", se=FALSE, size=1) +
  geom_jitter()
q5 <- scatter_vs_obesity("total_fibre")
q5 + stat_smooth(method="lm", se=FALSE, size=1) +
  geom_jitter()
## `geom_smooth()` using formula 'y ~ x'  (printed once per plot)
The distributions do not appear to be normal, so we check this in the next step.
Make distribution histograms to check whether each variable follows a normal distribution.
# Histogram with an overlaid kernel density estimate, used to eyeball
# whether a variable is approximately normal. The `..density..` notation is
# deprecated since ggplot2 3.4.0 — use after_stat(density) instead.
density_hist <- function(xvar) {
  # xvar: column name (string) in lb_profile3.
  ggplot(lb_profile3, aes(x = .data[[xvar]])) +
    geom_histogram(aes(y = after_stat(density))) +
    geom_density(colour = "red", size = 1, adjust = 1) +
    labs(x = xvar)
}
#dependent variable
density_hist("f_obese")
#independent variables
density_hist("total_fat")
density_hist("total_sugar")
density_hist("total_saturate")
density_hist("total_carb")
density_hist("total_fibre")
density_hist("fat")
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.  (per plot)
Most independent variables do not follow a normal distribution. Apply a log or power transform so that the transformed variables better approximate a normal distribution.
Use box-plot to see if we should select log or exponential
# Tukey ladder-of-powers boxplots (car::symbox) for each candidate variable,
# used to choose a normalising transformation. Looping over the formulas
# keeps the power ladder in one place.
symbox_powers <- seq(-3, 3, by = 0.5)
for (f in list(~f_obese, ~total_fat, ~total_sugar, ~total_saturate,
               ~total_carb, ~total_fibre, ~carb)) {
  symbox(f, lb_profile3, na.rm = TRUE, powers = symbox_powers)
}
According to the box-plots, a power transform (^0.5) is a suitable transform for our data to approximate a normal distribution.
# Histograms of the square-root-transformed variables, to confirm the ^0.5
# power transform chosen from the symbox plots. print() is required because
# ggplot objects do not auto-print inside a for loop; labs() reproduces the
# original axis titles (e.g. "total_fat^0.5").
for (v in c("f_obese", "total_fat", "total_sugar",
            "total_saturate", "total_carb", "total_fibre")) {
  print(
    ggplot(lb_profile3, aes(x = .data[[v]]^0.5)) +
      geom_histogram() +
      labs(x = paste0(v, "^0.5"))
  )
}
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.  (per plot)
Using the transformed variables, fit OLS regression models and compare the r-squared, p-values, VIF values and coefficients to find the best model.
# Model 1: OLS of the raw obesity rate on the untransformed nutrient totals.
model1 <- lb_profile3%>%
lm(f_obese ~ total_fat+total_sugar+total_saturate+total_carb+total_fibre, data = .)
tidy(model1)
## # A tibble: 6 × 5
## term estimate std.error statistic p.value
## <chr> <dbl> <dbl> <dbl> <dbl>
## 1 (Intercept) 21.3 1.36 15.6 4.90e-15
## 2 total_fat -0.000000488 0.000000379 -1.29 2.08e- 1
## 3 total_sugar -0.0000000294 0.000000214 -0.137 8.92e- 1
## 4 total_saturate -0.000000169 0.000000840 -0.201 8.42e- 1
## 5 total_carb 0.000000236 0.000000115 2.05 4.99e- 2
## 6 total_fibre 0.000000580 0.000000637 0.910 3.71e- 1
glance(model1)
## # A tibble: 1 × 12
## r.squared adj.r.squared sigma statistic p.value df logLik AIC BIC
## <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 0.314 0.187 4.49 2.47 0.0572 5 -93.1 200. 211.
## # … with 3 more variables: deviance <dbl>, df.residual <int>, nobs <int>
# Variance inflation factors: values far above 10 indicate severe
# multicollinearity among the predictors.
vif(model1)
## total_fat total_sugar total_saturate total_carb total_fibre
## 4150.0884 1597.1585 3197.4571 1581.4060 383.2046
The p-values are too high in model1, and the VIF values are very high as well.
# Model 2: OLS on the square-root-transformed response and predictors;
# I() protects the ^0.5 transforms inside the formula.
model2 <- lb_profile3%>%
lm(I(f_obese^0.5) ~ I(total_fat^0.5)+ I(total_sugar^0.5) + I(total_saturate^0.5)+ I(total_carb^0.5)+I(total_fibre^0.5), data = .)
tidy(model2)
## # A tibble: 6 × 5
## term estimate std.error statistic p.value
## <chr> <dbl> <dbl> <dbl> <dbl>
## 1 (Intercept) 4.75 0.254 18.7 5.50e-17
## 2 I(total_fat^0.5) -0.00182 0.00121 -1.51 1.43e- 1
## 3 I(total_sugar^0.5) -0.000367 0.000727 -0.504 6.18e- 1
## 4 I(total_saturate^0.5) 0.000251 0.00159 0.158 8.76e- 1
## 5 I(total_carb^0.5) 0.00125 0.000537 2.33 2.77e- 2
## 6 I(total_fibre^0.5) 0.000595 0.000971 0.613 5.45e- 1
glance(model2)
## # A tibble: 1 × 12
## r.squared adj.r.squared sigma statistic p.value df logLik AIC BIC
## <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 0.413 0.305 0.467 3.81 0.00973 5 -18.4 50.7 61.2
## # … with 3 more variables: deviance <dbl>, df.residual <int>, nobs <int>
vif(model2)
## I(total_fat^0.5) I(total_sugar^0.5) I(total_saturate^0.5)
## 6610.2402 2584.0167 4546.0260
## I(total_carb^0.5) I(total_fibre^0.5)
## 2655.7883 764.7656
The VIF values are too high in model2, but this is expected: each total_* predictor is the per-transaction average multiplied by the same num_transactions, which inflates the collinearity.
# Model 3: square-root response against the per-transaction nutrient
# averages, which avoids the num_transactions multiplier shared by all the
# total_* predictors.
model3 <- lb_profile3%>%
lm(f_obese^0.5 ~ fat+sugar+saturate+carb+fibre, data = .)
tidy(model3)
## # A tibble: 6 × 5
## term estimate std.error statistic p.value
## <chr> <dbl> <dbl> <dbl> <dbl>
## 1 (Intercept) 13.2 5.06 2.60 0.0149
## 2 fat -1.59 0.919 -1.73 0.0942
## 3 sugar 0.499 0.347 1.44 0.162
## 4 saturate 0.465 1.65 0.282 0.780
## 5 carb 0.162 0.212 0.765 0.451
## 6 fibre -2.47 2.74 -0.903 0.374
glance(model3)
## # A tibble: 1 × 12
## r.squared adj.r.squared sigma statistic p.value df logLik AIC BIC
## <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 0.421 0.314 0.463 3.93 0.00830 5 -18.1 50.3 60.7
## # … with 3 more variables: deviance <dbl>, df.residual <int>, nobs <int>
vif(model3)
## fat sugar saturate carb fibre
## 7.009292 9.772034 3.599163 9.818239 2.286981
Model3 shows low p-values and a reasonable r-squared, and the VIF values are below 10, indicating low correlation and multicollinearity among the predictors.
Make the residuals distribution histograms to check whether they follow a normal distribution.
# Attach fitted values and residuals from each model to the borough data.
model_data1 <- model1 %>%
  augment(., lb_profile3)
model_data2 <- model2 %>%
  augment(., lb_profile3)
model_data3 <- model3 %>%
  augment(., lb_profile3)
# Plot the residual histograms. The original `qplot(...) + geom_histogram()`
# drew the histogram twice (qplot already chooses a histogram for a single
# numeric vector — hence the doubled `stat_bin()` messages); one explicit
# layer is enough.
resid_hist <- function(model_data) {
  # model_data: an augment()-ed data frame containing a .resid column.
  ggplot(model_data, aes(x = .resid)) +
    geom_histogram()
}
resid_hist(model_data1)
resid_hist(model_data2)
resid_hist(model_data3)
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.  (per plot)
They all appear to approximately follow a normal distribution.
Make the residuals plot, to check the homoscedasticity
# Standard lm diagnostic plots for each model (residuals vs fitted, Q-Q,
# scale-location, residuals vs leverage), arranged in a 2x2 grid to check
# homoscedasticity and normality of residuals.
par(mfrow=c(2,2)) #plot to 2 by 2 array
plot(model1)
par(mfrow=c(2,2)) #plot to 2 by 2 array
plot(model2)
par(mfrow=c(2,2)) #plot to 2 by 2 array
plot(model3)
Now we need to focus on the influence of spatial auto-correlation of model3, and consider about doing the spatial regression to move the influence of spatial correlation
# Durbin-Watson test on model3 residuals: a statistic below 2 suggests
# positive autocorrelation of the residuals.
DW <- durbinWatsonTest(model3)
tidy(DW)
## # A tibble: 1 × 5
## statistic p.value autocorrelation method alternative
## <dbl> <dbl> <dbl> <chr> <chr>
## 1 1.17 0.0120 0.289 Durbin-Watson Test two.sided
The DW statistic (<2) shows a small positive spatial autocorrelation; let’s check the residuals map of model3. Add the residuals of model3 to the data.
# Store the residuals of models 2 and 3 on the spatial data for mapping.
lb_profile3 <- lb_profile3 %>%
mutate(model3resids = residuals(model3)) %>%
mutate(model2resids = residuals(model2))
Plot the residuals map of model3
# Map the residuals of models 2 and 3 to look for spatial patterning
# (clusters of same-signed residuals hint at spatial autocorrelation).
tmap_mode("plot")
## tmap mode set to plotting
tm_shape(lb_profile3) +
tm_polygons("model2resids",
palette = "RdYlBu")
## Variable(s) "model2resids" contains positive and negative values, so midpoint is set to 0. Set midpoint = NA to show the full spectrum of the color palette.
tmap_mode("plot")
## tmap mode set to plotting
tm_shape(lb_profile3) +
tm_polygons("model3resids",
palette = "RdYlBu")
## Variable(s) "model3resids" contains positive and negative values, so midpoint is set to 0. Set midpoint = NA to show the full spectrum of the color palette.
Similar colours in neighbouring boroughs roughly suggest spatial correlation, but we need Moran’s I tests to check in more detail.
Make the queen neighbors lists/ list.weight and k nearest neighbors lists/list.weight.
# Build two alternative neighbour definitions for the residual tests:
# queen contiguity and k = 4 nearest neighbours, both row-standardised ("W").
# NOTE(review): coordsW repeats the coords_lb computation above —
# presumably kept so this section is self-contained.
coordsW <- lb_profile3%>%
st_centroid()%>%
st_geometry()
## Warning in st_centroid.sf(.): st_centroid assumes attributes are constant over
## geometries of x
lb_qnb <- lb_profile3 %>%
poly2nb(., queen=T)
#or nearest neighbours
lb_knn <-coordsW %>%
knearneigh(., k=4) %>%
knn2nb()
plot(lb_qnb, st_geometry(coordsW), col="red")
plot(lb_knn, st_geometry(coordsW), col="red")
lb.queens_weight <- lb_qnb %>%
nb2listw(., style="W")
lb.knn_4_weight <- lb_knn %>%
nb2listw(., style="W")
Use Moran I’s test to check the spatial auto-correlation of the residuals of model3 more precisely.
# Moran's I on model3 residuals under both weighting schemes; tidy()
# flattens each htest into a one-row tibble for easy comparison.
Queen_model3 <- lb_profile3 %>%
st_drop_geometry()%>%
dplyr::select(model3resids)%>%
pull()%>%
moran.test(., lb.queens_weight)%>%
tidy()
K_Nearest_neighbour_model3 <- lb_profile3 %>%
st_drop_geometry()%>%
dplyr::select(model3resids)%>%
pull()%>%
moran.test(., lb.knn_4_weight)%>%
tidy()
Queen_model3
## # A tibble: 1 × 7
## estimate1 estimate2 estimate3 statistic p.value method alternative
## <dbl> <dbl> <dbl> <dbl> <dbl> <chr> <chr>
## 1 -0.0485 -0.0312 0.0131 -0.151 0.560 Moran I test unde… greater
K_Nearest_neighbour_model3
## # A tibble: 1 × 7
## estimate1 estimate2 estimate3 statistic p.value method alternative
## <dbl> <dbl> <dbl> <dbl> <dbl> <chr> <chr>
## 1 -0.132 -0.0312 0.0107 -0.976 0.835 Moran I test unde… greater
We can now make sure low negative-spatial auto-correlation exists, then I would like using Lagrange Multiplier (LM) test to select a suitable model between the spatial Lag model and the spatial error model for same independent and dependent in model3. If the LM test does not give a suitable output, I will change to use GWR model.
# Lagrange multiplier diagnostics to choose between a spatial lag and a
# spatial error specification for the model3 formula.
# NOTE(review): lb.queens_weight_ROW_model3 duplicates lb.queens_weight
# (same neighbours, same "W" style). Newer spdep releases rename
# lm.LMtests() to lm.RStests() — confirm against the installed version.
lb.queens_weight_ROW_model3 <- lb_qnb %>%
nb2listw(., style="W")
lm.LMtests(model3, lb.queens_weight_ROW_model3, test = c("LMerr","LMlag","RLMerr","RLMlag","SARMA"))
##
## Lagrange multiplier diagnostics for spatial dependence
##
## data:
## model: lm(formula = f_obese^0.5 ~ fat + sugar + saturate + carb +
## fibre, data = .)
## weights: lb.queens_weight_ROW_model3
##
## LMerr = 0.15199, df = 1, p-value = 0.6966
##
##
## Lagrange multiplier diagnostics for spatial dependence
##
## data:
## model: lm(formula = f_obese^0.5 ~ fat + sugar + saturate + carb +
## fibre, data = .)
## weights: lb.queens_weight_ROW_model3
##
## LMlag = 0.069299, df = 1, p-value = 0.7924
##
##
## Lagrange multiplier diagnostics for spatial dependence
##
## data:
## model: lm(formula = f_obese^0.5 ~ fat + sugar + saturate + carb +
## fibre, data = .)
## weights: lb.queens_weight_ROW_model3
##
## RLMerr = 0.12682, df = 1, p-value = 0.7218
##
##
## Lagrange multiplier diagnostics for spatial dependence
##
## data:
## model: lm(formula = f_obese^0.5 ~ fat + sugar + saturate + carb +
## fibre, data = .)
## weights: lb.queens_weight_ROW_model3
##
## RLMlag = 0.044129, df = 1, p-value = 0.8336
##
##
## Lagrange multiplier diagnostics for spatial dependence
##
## data:
## model: lm(formula = f_obese^0.5 ~ fat + sugar + saturate + carb +
## fibre, data = .)
## weights: lb.queens_weight_ROW_model3
##
## SARMA = 0.19612, df = 2, p-value = 0.9066
Seems like everything’s p-value is high, so I would like to do a GWR model to see the coefficient of each nutrients in boroughs.
Now making a Geographically Weighted Regression Model.
# Append centroid X/Y coordinates to the data for the GWR call.
coordsW2 <- st_coordinates(coordsW)
lb_profile4 <- cbind(lb_profile3,coordsW2)
# NOTE(review): the original (commented) gwr.sel call passed `data = lb_profile4`
# TWICE, which is itself an error in R — very likely the reason bandwidth
# selection "errored". Removing the duplicate argument should allow
# gwr.sel() to run:
# GWRbandwidth <- gwr.sel(f_obese^0.5 ~ fat+sugar+saturate+carb+fibre,
# data = lb_profile4,
# coords=cbind(lb_profile4$X, lb_profile4$Y),
# adapt=T)
# errors on finding the best bandwidth, just random pick one
gwr.model = gwr(f_obese^0.5 ~ fat+sugar+saturate+carb+fibre,
data = lb_profile4,
coords=cbind(lb_profile4$X, lb_profile4$Y),
adapt=0.09016994,#if I can't find a best bandwidth, then set a fixed number
#matrix output
hatmatrix=TRUE,
#standard error
se.fit=TRUE)
#print the results of the model
gwr.model
## Call:
## gwr(formula = f_obese^0.5 ~ fat + sugar + saturate + carb + fibre,
## data = lb_profile4, coords = cbind(lb_profile4$X, lb_profile4$Y),
## adapt = 0.09016994, hatmatrix = TRUE, se.fit = TRUE)
## Kernel function: gwr.Gauss
## Adaptive quantile: 0.09016994 (about 2 of 33 data points)
## Summary of GWR coefficient estimates at data points:
## Min. 1st Qu. Median 3rd Qu. Max. Global
## X.Intercept. 6.104625 12.111983 16.409629 21.155350 43.880196 13.1524
## fat -5.645676 -2.653788 -2.038285 -1.442799 -0.824777 -1.5944
## sugar -0.031909 0.448531 0.669031 0.945537 1.637667 0.4993
## saturate -5.176539 -0.445703 0.177993 1.965549 5.703637 0.4648
## carb -0.399558 -0.046645 0.080902 0.227878 0.898984 0.1624
## fibre -9.290865 -3.577115 -1.935203 -0.492981 1.609352 -2.4737
## Number of data points: 33
## Effective number of parameters (residual: 2traceS - traceS'S): 23.31585
## Effective degrees of freedom (residual: 2traceS - traceS'S): 9.684152
## Sigma (residual: 2traceS - traceS'S): 0.4821393
## Effective number of parameters (model: traceS): 18.7869
## Effective degrees of freedom (model: traceS): 14.2131
## Sigma (model: traceS): 0.3979778
## Sigma (ML): 0.2611839
## AICc (GWR p. 61, eq 2.33; p. 96, eq. 4.21): 111.972
## AIC (GWR p. 96, eq. 4.22): 23.82982
## Residual sum of squares: 2.251161
## Quasi-global R2: 0.7753594
# Extract the per-borough coefficient estimates and standard errors
# from the GWR SpatialDataFrame.
results <- as.data.frame(gwr.model$SDF)
names(results)
## [1] "sum.w" "X.Intercept." "fat"
## [4] "sugar" "saturate" "carb"
## [7] "fibre" "X.Intercept._se" "fat_se"
## [10] "sugar_se" "saturate_se" "carb_se"
## [13] "fibre_se" "gwr.e" "pred"
## [16] "pred.se" "localR2" "X.Intercept._se_EDF"
## [19] "fat_se_EDF" "sugar_se_EDF" "saturate_se_EDF"
## [22] "carb_se_EDF" "fibre_se_EDF" "pred.se.1"
## [25] "coord.x" "coord.y"
Plot the coefficient of these 5 variables calculated by gwr model in map.
# Attach the GWR coefficient surfaces to the borough data for mapping.
# The original call had a trailing comma after the last argument, which
# only works because tidyverse argument splicing tolerates empty arguments;
# removed for portability and clarity.
lb_profile4 <- lb_profile4 %>%
  mutate(coef_fat = results$fat,
         coef_sugar = results$sugar,
         coef_saturate = results$saturate,
         coef_carb = results$carb,
         coef_fibre = results$fibre)
# Interactive (view-mode) maps of each local GWR coefficient surface.
tmap_mode("view")
## tmap mode set to interactive viewing
tm_shape(lb_profile4) +
tm_polygons(col = "coef_fat",
palette = "RdBu",
alpha = 0.5)
tmap_mode("view")
## tmap mode set to interactive viewing
tm_shape(lb_profile4) +
tm_polygons(col = "coef_sugar",
palette = "RdBu",
alpha = 0.5)
## Variable(s) "coef_sugar" contains positive and negative values, so midpoint is set to 0. Set midpoint = NA to show the full spectrum of the color palette.
tmap_mode("view")
## tmap mode set to interactive viewing
tm_shape(lb_profile4) +
tm_polygons(col = "coef_saturate",
palette = "RdBu",
alpha = 0.5)
## Variable(s) "coef_saturate" contains positive and negative values, so midpoint is set to 0. Set midpoint = NA to show the full spectrum of the color palette.
tmap_mode("view")
## tmap mode set to interactive viewing
tm_shape(lb_profile4) +
tm_polygons(col = "coef_carb",
palette = "RdBu",
alpha = 0.5)
## Variable(s) "coef_carb" contains positive and negative values, so midpoint is set to 0. Set midpoint = NA to show the full spectrum of the color palette.
tmap_mode("view")
## tmap mode set to interactive viewing
tm_shape(lb_profile4) +
tm_polygons(col = "coef_fibre",
palette = "RdBu",
alpha = 0.5)
## Variable(s) "coef_fibre" contains positive and negative values, so midpoint is set to 0. Set midpoint = NA to show the full spectrum of the color palette.
From the GWR maps we can see how obesity is affected by the average nutrient content of food in each borough. - Fat in food has a negative correlation with obesity in every borough. - Sugar in food has a positive correlation with obesity in nearly every borough. - Saturates in food have a positive correlation with obesity in the south, east and west of London, except the north. - Carbohydrate in food has a positive correlation with obesity in the south, east and west of London. - Fibre in food has a negative correlation with obesity in nearly every borough.
Plot the sigtest(like residuals) of variables(under 18) by map
# Significance screen for the GWR coefficients: |coefficient| - 2*SE > 0
# suggests the local coefficient differs from zero at roughly the 95% level.
# NOTE(review): as written, the commented pipeline ends with a trailing %>%
# after the last mutate(), so it would error if simply uncommented — remove
# the final pipe before re-enabling.
# sigTest1 = abs(gwr.model$SDF$"fat")-2 * gwr.model$SDF$"fat_se"
# sigTest2 = abs(gwr.model$SDF$"sugar")-2 * gwr.model$SDF$"sugar_se"
# sigTest3 = abs(gwr.model$SDF$"saturate")-2 * gwr.model$SDF$"saturate_se"
# sigTest4 = abs(gwr.model$SDF$"carb")-2 * gwr.model$SDF$"carb_se"
# sigTest5 = abs(gwr.model$SDF$"fibre")-2 * gwr.model$SDF$"fibre_se"
# lb_profile4 <- lb_profile4 %>%
# mutate(fat_sig = sigTest1) %>%
# mutate(sugar_sig = sigTest2) %>%
# mutate(saturate_sig = sigTest3) %>%
# mutate(carb_sig = sigTest4) %>%
# mutate(fibre_sig = sigTest5) %>%
#
#
# tmap_mode("view")
#
# tm_shape(lb_profile4) +
# tm_polygons(col = "fat_sig",
# palette = "RdYlBu")
#
# tmap_mode("view")
#
# tm_shape(lb_profile4) +
# tm_polygons(col = "sugar_sig",
# palette = "RdYlBu")
There was no time to fix the significance test, which would support the conclusions about the spatial pattern; if it were completed, we could confirm which local coefficients are statistically significant.
Sugar, saturates and carbohydrate in food have a positive correlation with adult obesity in most London boroughs. In contrast, fibre and fat in food have a negative correlation with adult obesity. So the advice for government is to promote food that contains high fibre and appropriate healthy fats.